Extract the data.

#Set training data and test data
train = read.csv("train.csv", stringsAsFactors = F)
test = read.csv("test.csv", stringsAsFactors = F)

Histogram/Plot and any additional exploratory analysis of the attributes.

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.0.3
hist(train$MSSubClass)

hist(test$MSSubClass)

boxplot(train$MSSubClass)

boxplot(test$MSSubClass)

train1 <- data.frame(train$MSZoning)
ggplot(train1) + geom_bar(aes(x=train$MSZoning))

test1 <- data.frame(test$MSZoning)
ggplot(test1) + geom_bar(aes(x=test$MSZoning))

hist(train$LotFrontage)

hist(test$LotFrontage)

boxplot(train$LotFrontage)

boxplot(test$LotFrontage)

median(train$LotFrontage, na.rm= TRUE)
## [1] 69
median(test$LotFrontage, na.rm= TRUE)
## [1] 67
hist(train$LotArea, xlim = c(0,55000))

hist(test$LotArea, xlim = c(0,55000))

boxplot(train$LotArea)

boxplot(test$LotArea)

summary(train$LotArea)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1300    7554    9478   10517   11602  215245
summary(test$LotArea)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1470    7391    9399    9819   11518   56600
train2 <- data.frame(train$Street)
ggplot(train2) + geom_bar(aes(x=train$Street))

test2 <- data.frame(test$Street)
ggplot(test2) + geom_bar(aes(x=test$Street))

train3 <- data.frame(train$Alley)
ggplot(train3) + geom_bar(aes(x=train$Alley))

test3<- data.frame(test$Alley)
ggplot(test3) + geom_bar(aes(x=test$Alley)) 

train4 <- data.frame(train$LotShape)
ggplot(train4) + geom_bar(aes(x=train$LotShape))

test4<- data.frame(test$LotShape)
ggplot(test4) + geom_bar(aes(x=test$LotShape)) 

train5 <- data.frame(train$LandContour)
ggplot(train5) + geom_bar(aes(x=train$LandContour))

test5<- data.frame(test$LandContour)
ggplot(test5) + geom_bar(aes(x=test$LandContour)) 

train6 <- data.frame(train$Utilities)
ggplot(train6) + geom_bar(aes(x=train$Utilities))

test6<- data.frame(test$Utilities)
ggplot(test6) + geom_bar(aes(x=test$Utilities)) 

train6 <- data.frame(train$LotConfig)
ggplot(train6) + geom_bar(aes(x=train$LotConfig))

test6<- data.frame(test$LotConfig)
ggplot(test6) + geom_bar(aes(x=test$LotConfig))

train6 <- data.frame(train$LandSlope)
ggplot(train6) + geom_bar(aes(x=train$LandSlope))

test6<- data.frame(test$LandSlope)
ggplot(test6) + geom_bar(aes(x=test$LandSlope))

train6 <- data.frame(train$Neighborhood)
ggplot(train6) + geom_bar(aes(x=train$Neighborhood))

test6<- data.frame(test$Neighborhood)
ggplot(test6) + geom_bar(aes(x=test$Neighborhood))

train6 <- data.frame(train$Condition1)
ggplot(train6) + geom_bar(aes(x=train$Condition1))

test6<- data.frame(test$Condition1)
ggplot(test6) + geom_bar(aes(x=test$Condition1))

train6 <- data.frame(train$Condition2)
ggplot(train6) + geom_bar(aes(x=train$Condition2))

test6<- data.frame(test$Condition2)
ggplot(test6) + geom_bar(aes(x=test$Condition2))

train6 <- data.frame(train$BldgType)
ggplot(train6) + geom_bar(aes(x=train$BldgType))

test6<- data.frame(test$BldgType)
ggplot(test6) + geom_bar(aes(x=test$BldgType))

train6 <- data.frame(train$HouseStyle)
ggplot(train6) + geom_bar(aes(x=train$HouseStyle))

test6<- data.frame(test$HouseStyle)
ggplot(test6) + geom_bar(aes(x=test$HouseStyle))

train6 <- data.frame(train$OverallQual)
ggplot(train6) + geom_bar(aes(x=train$OverallQual))

test6<- data.frame(test$OverallQual)
ggplot(test6) + geom_bar(aes(x=test$OverallQual))

train6 <- data.frame(train$OverallCond)
ggplot(train6) + geom_bar(aes(x=train$OverallCond))

test6<- data.frame(test$OverallCond)
ggplot(test6) + geom_bar(aes(x=test$OverallCond))

hist(train$YearBuilt)

hist(test$YearBuilt)

hist(train$YearRemodAdd)

hist(test$YearRemodAdd)

train6 <- data.frame(train$RoofStyle)
ggplot(train6) + geom_bar(aes(x=train$RoofStyle))

test6<- data.frame(test$RoofStyle)
ggplot(test6) + geom_bar(aes(x=test$RoofStyle))

train6 <- data.frame(train$RoofMatl)
ggplot(train6) + geom_bar(aes(x=train$RoofMatl))

test6<- data.frame(test$RoofMatl)
ggplot(test6) + geom_bar(aes(x=test$RoofMatl))

test6<- data.frame(test$RoofStyle)
ggplot(test6) + geom_bar(aes(x=test$RoofStyle))

test6<- data.frame(train$RoofStyle)
ggplot(test6) + geom_bar(aes(x=train$RoofStyle))

test6<- data.frame(train$Exterior1st)
ggplot(test6) + geom_bar(aes(x=train$Exterior1st))

test6<- data.frame(test$Exterior1st)
ggplot(test6) + geom_bar(aes(x=test$Exterior1st))

test6<- data.frame(train$Exterior2nd)
ggplot(test6) + geom_bar(aes(x=train$Exterior2nd))

test6<- data.frame(test$Exterior1st)
ggplot(test6) + geom_bar(aes(x=test$Exterior2nd))

test6<- data.frame(train$MasVnrType)
ggplot(test6) + geom_bar(aes(x=train$MasVnrType))

test6<- data.frame(test$MasVnrType)
ggplot(test6) + geom_bar(aes(x=test$MasVnrType))

hist(train$MasVnrArea)

hist(test$MasVnrArea)

test6<- data.frame(train$ExterQual)
ggplot(test6) + geom_bar(aes(x=train$ExterQual))

test6<- data.frame(test$ExterQual)
ggplot(test6) + geom_bar(aes(x=test$ExterQual))

test6<- data.frame(train$ExterCond)
ggplot(test6) + geom_bar(aes(x=train$ExterCond))

test6<- data.frame(test$ExterCond)
ggplot(test6) + geom_bar(aes(x=test$ExterCond))

test6<- data.frame(train$Foundation)
ggplot(test6) + geom_bar(aes(x=train$Foundation))

test6<- data.frame(test$ExterCond)
ggplot(test6) + geom_bar(aes(x=test$Foundation))

ggplot(data.frame(train$BsmtQual)) + geom_bar(aes(x=train$BsmtQual))

ggplot(data.frame(test$BsmtQual)) + geom_bar(aes(x=test$BsmtQual))

ggplot(data.frame(train$BsmtCond)) + geom_bar(aes(x=train$BsmtCond))

ggplot(data.frame(test$BsmtCond)) + geom_bar(aes(x=test$BsmtCond))

ggplot(data.frame(train$BsmtExposure)) + geom_bar(aes(x=train$BsmtExposure))

ggplot(data.frame(test$BsmtExposure)) + geom_bar(aes(x=test$BsmtExposure))

ggplot(data.frame(train$BsmtFinType1)) + geom_bar(aes(x=train$BsmtFinType1))

ggplot(data.frame(test$BsmtFinType1)) + geom_bar(aes(x=test$BsmtFinType1))

hist(train$BsmtFinSF1)

hist(test$BsmtFinSF1)

ggplot(data.frame(train$BsmtFinType2)) + geom_bar(aes(x=train$BsmtFinType2))

ggplot(data.frame(test$BsmtFinType2)) + geom_bar(aes(x=test$BsmtFinType2))

hist(train$BsmtFinSF2)

hist(test$BsmtFinSF2)

hist(train$BsmtUnfSF)

hist(test$BsmtUnfSF)

hist(train$TotalBsmtSF)

hist(test$TotalBsmtSF)

ggplot(data.frame(train$Heating)) + geom_bar(aes(x=train$Heating))

ggplot(data.frame(test$Heating)) + geom_bar(aes(x=test$Heating))

ggplot(data.frame(train$HeatingQC)) + geom_bar(aes(x=train$HeatingQC))

ggplot(data.frame(test$HeatingQC)) + geom_bar(aes(x=test$HeatingQC))

ggplot(data.frame(train$CentralAir)) + geom_bar(aes(x=train$CentralAir))

ggplot(data.frame(test$CentralAir)) + geom_bar(aes(x=test$CentralAir))

ggplot(data.frame(train$CentralAir)) + geom_bar(aes(x=train$CentralAir))

ggplot(data.frame(test$CentralAir)) + geom_bar(aes(x=test$CentralAir))

ggplot(data.frame(train$Electrical)) + geom_bar(aes(x=train$Electrical))

ggplot(data.frame(test$Electrical)) + geom_bar(aes(x=test$Electrical))

hist(train$X1stFlrSF)

hist(test$X1stFlrSF)

mean(train$X1stFlrSF)
## [1] 1162.627
mean(test$X1stFlrSF)
## [1] 1156.535
hist(train$X2ndFlrSF)

hist(test$X2ndFlrSF)

mean(train$X2ndFlrSF)
## [1] 346.9925
mean(test$X2ndFlrSF)
## [1] 325.9678
hist(train$LowQualFinSF)

hist(test$LowQualFinSF)

mean(train$LowQualFinSF)
## [1] 5.844521
mean(test$LowQualFinSF)
## [1] 3.543523
hist(train$GrLivArea)

hist(test$GrLivArea)

mean(train$GrLivArea)
## [1] 1515.464
mean(test$GrLivArea)
## [1] 1486.046
hist(train$BsmtFullBath)

hist(test$BsmtFullBath)

hist(train$BsmtHalfBath)

hist(test$BsmtHalfBath)

hist(train$BedroomAbvGr)

hist(test$Bedroom)

hist(train$KitchenAbvGr)

hist(test$KitchenAbvGr)

ggplot(data.frame(train$KitchenQual)) + geom_bar(aes(x=train$KitchenQual))

ggplot(data.frame(test$KitchenQual)) + geom_bar(aes(x=test$KitchenQual))

hist(train$TotRmsAbvGrd)

hist(test$TotRmsAbvGrd)

ggplot(data.frame(train$Functional)) + geom_bar(aes(x=train$Functional))

ggplot(data.frame(test$Functional)) + geom_bar(aes(x=test$Functional))

hist(train$Fireplaces)

hist(test$Fireplaces)

ggplot(data.frame(train$FireplaceQu)) + geom_bar(aes(x=train$FireplaceQu))

ggplot(data.frame(test$FireplaceQu)) + geom_bar(aes(x=test$FireplaceQu))

ggplot(data.frame(train$GarageType)) + geom_bar(aes(x=train$GarageType))

ggplot(data.frame(test$GarageType)) + geom_bar(aes(x=test$GarageType))

hist(train$GarageYrBlt)

hist(test$GarageYrBlt)

ggplot(data.frame(train$GarageFinish)) + geom_bar(aes(x=train$GarageFinish))

ggplot(data.frame(test$GarageFinish)) + geom_bar(aes(x=test$GarageFinish))

hist(train$GarageArea)

hist(test$GarageArea)

mean(train$GarageArea)
## [1] 472.9801
mean(test$GarageArea)
## [1] NA
ggplot(data.frame(train$GarageQual)) + geom_bar(aes(x=train$GarageQual))

ggplot(data.frame(test$GarageQual)) + geom_bar(aes(x=test$GarageQual))

ggplot(data.frame(train$GarageCond)) + geom_bar(aes(x=train$GarageCond))

ggplot(data.frame(test$GarageCond)) + geom_bar(aes(x=test$GarageCond))

ggplot(data.frame(train$PavedDrive)) + geom_bar(aes(x=train$PavedDrive))

ggplot(data.frame(test$PavedDrive)) + geom_bar(aes(x=test$PavedDrive))

hist(train$WoodDeckSF)

hist(test$WoodDeckSF)

hist(train$OpenPorchSF)

hist(test$OpenPorchSF)

hist(train$EnclosedPorch)

hist(test$EnclosedPorch)

hist(train$X3SsnPorch)

hist(test$X3SsnPorch)

hist(train$ScreenPorch)

hist(test$ScreenPorch)

hist(train$PoolArea)

hist(test$PoolArea)

ggplot(data.frame(train$PoolQC)) + geom_bar(aes(x=train$PoolQC))

ggplot(data.frame(test$PoolQC)) + geom_bar(aes(x=test$PoolQC))

ggplot(data.frame(train$Fence)) + geom_bar(aes(x=train$Fence))

ggplot(data.frame(test$Fence)) + geom_bar(aes(x=test$Fence))

ggplot(data.frame(train$MiscFeature)) + geom_bar(aes(x=train$MiscFeature))

ggplot(data.frame(test$MiscFeature)) + geom_bar(aes(x=test$MiscFeature))

hist(train$MiscVal)

hist(test$MiscVal)

mean(train$MiscVal)
## [1] 43.48904
mean(test$MiscVal)
## [1] 58.16792
hist(train$MoSold)

hist(test$MoSold)

hist(train$YrSold)

hist(test$YrSold)

ggplot(data.frame(train$SaleType)) + geom_bar(aes(x=train$SaleType))

ggplot(data.frame(test$SaleType)) + geom_bar(aes(x=test$SaleType))

ggplot(data.frame(train$SaleCondition)) + geom_bar(aes(x=train$SaleCondition))

ggplot(data.frame(test$SaleCondition)) + geom_bar(aes(x=test$SaleCondition))

Dimensions of the dataset for both the train and test dataset.

#Get dimensions of the train dataset
#There are 1,460 rows
nrow(train)
## [1] 1460
#There are 81 columns
ncol(train)
## [1] 81
#Get dimensions of the test data
#There are 1,459 rows in the test data
nrow(test)
## [1] 1459
#There are 80 columns in the test data
ncol(test)
## [1] 80

Determine features/variables that are categorical and numerical in nature in the train dataset

categoricalTrain <- names(which(sapply(train, class) == "character"))

#43 features are categorical in nature
length(categoricalTrain)
## [1] 43
categoricalTrain
##  [1] "MSZoning"      "Street"        "Alley"         "LotShape"     
##  [5] "LandContour"   "Utilities"     "LotConfig"     "LandSlope"    
##  [9] "Neighborhood"  "Condition1"    "Condition2"    "BldgType"     
## [13] "HouseStyle"    "RoofStyle"     "RoofMatl"      "Exterior1st"  
## [17] "Exterior2nd"   "MasVnrType"    "ExterQual"     "ExterCond"    
## [21] "Foundation"    "BsmtQual"      "BsmtCond"      "BsmtExposure" 
## [25] "BsmtFinType1"  "BsmtFinType2"  "Heating"       "HeatingQC"    
## [29] "CentralAir"    "Electrical"    "KitchenQual"   "Functional"   
## [33] "FireplaceQu"   "GarageType"    "GarageFinish"  "GarageQual"   
## [37] "GarageCond"    "PavedDrive"    "PoolQC"        "Fence"        
## [41] "MiscFeature"   "SaleType"      "SaleCondition"
print("   ")
## [1] "   "
#Count the number of features that are numerical in the test dataset
numericalTrain <- names(which(sapply(train, class) != "character"))

#38 features are numerical in nature
length(numericalTrain)
## [1] 38
numericalTrain
##  [1] "Id"            "MSSubClass"    "LotFrontage"   "LotArea"      
##  [5] "OverallQual"   "OverallCond"   "YearBuilt"     "YearRemodAdd" 
##  [9] "MasVnrArea"    "BsmtFinSF1"    "BsmtFinSF2"    "BsmtUnfSF"    
## [13] "TotalBsmtSF"   "X1stFlrSF"     "X2ndFlrSF"     "LowQualFinSF" 
## [17] "GrLivArea"     "BsmtFullBath"  "BsmtHalfBath"  "FullBath"     
## [21] "HalfBath"      "BedroomAbvGr"  "KitchenAbvGr"  "TotRmsAbvGrd" 
## [25] "Fireplaces"    "GarageYrBlt"   "GarageCars"    "GarageArea"   
## [29] "WoodDeckSF"    "OpenPorchSF"   "EnclosedPorch" "X3SsnPorch"   
## [33] "ScreenPorch"   "PoolArea"      "MiscVal"       "MoSold"       
## [37] "YrSold"        "SalePrice"

Determine features/variables that are categorical and numerical in nature in the test dataset

categoricalTest <- names(which(sapply(test, class) == "character"))

#43 features are categorical in nature
length(categoricalTest)
## [1] 43
numericalTest <- names(which(sapply(test, class) != "character"))

#37 features are numerical in nature
length(numericalTest)
## [1] 37
#There is a discrepancy in the number of features between the test and train dataset
trainColumns <- colnames(train)
testColumns <- colnames(test)

#The result is the SalePrice attribute is missing in the test dataset but that is to be expected.
matchV <- match(trainColumns, testColumns)
head(train[is.na(matchV)])

It can be noted that the following categorical features are noted with “NA” but it does not necessarily mean that the feature has an invalid value due to error but it actually does not possess the physical feature.

print("Train dataset that have NA values")
## [1] "Train dataset that have NA values"
for (col in 1:ncol(train)) {
  x<- colnames(train[col])
  y<- sum(is.na(train[col]))
  if (y > 0) {
    print(paste(x,y))
  }
}
## [1] "LotFrontage 259"
## [1] "Alley 1369"
## [1] "MasVnrType 8"
## [1] "MasVnrArea 8"
## [1] "BsmtQual 37"
## [1] "BsmtCond 37"
## [1] "BsmtExposure 38"
## [1] "BsmtFinType1 37"
## [1] "BsmtFinType2 38"
## [1] "Electrical 1"
## [1] "FireplaceQu 690"
## [1] "GarageType 81"
## [1] "GarageYrBlt 81"
## [1] "GarageFinish 81"
## [1] "GarageQual 81"
## [1] "GarageCond 81"
## [1] "PoolQC 1453"
## [1] "Fence 1179"
## [1] "MiscFeature 1406"
print("Test data that have NA values")
## [1] "Test data that have NA values"
for (col in 1:ncol(test)) {
  x<- colnames(test[col])
  y<- sum(is.na(test[col]))
  if (y > 0) {
    print(paste(x,y))
  }
}
## [1] "MSZoning 4"
## [1] "LotFrontage 227"
## [1] "Alley 1352"
## [1] "Utilities 2"
## [1] "Exterior1st 1"
## [1] "Exterior2nd 1"
## [1] "MasVnrType 16"
## [1] "MasVnrArea 15"
## [1] "BsmtQual 44"
## [1] "BsmtCond 45"
## [1] "BsmtExposure 44"
## [1] "BsmtFinType1 42"
## [1] "BsmtFinSF1 1"
## [1] "BsmtFinType2 42"
## [1] "BsmtFinSF2 1"
## [1] "BsmtUnfSF 1"
## [1] "TotalBsmtSF 1"
## [1] "BsmtFullBath 2"
## [1] "BsmtHalfBath 2"
## [1] "KitchenQual 1"
## [1] "Functional 2"
## [1] "FireplaceQu 730"
## [1] "GarageType 76"
## [1] "GarageYrBlt 78"
## [1] "GarageFinish 78"
## [1] "GarageCars 1"
## [1] "GarageArea 1"
## [1] "GarageQual 78"
## [1] "GarageCond 78"
## [1] "PoolQC 1456"
## [1] "Fence 1169"
## [1] "MiscFeature 1408"
## [1] "SaleType 1"

To combat this issue, we will replace the “NA” in these to their actual value based on the data description text file.

train$Alley[is.na(train$Alley)] = "No alley access"
test$Alley[is.na(test$Alley)] = "No alley access"

train$BsmtQual[is.na(train$BsmtQual)] = "No Basement"
test$BsmtQual[is.na(test$BsmtQual)] = "No Basement"

train$BsmtCond[is.na(train$BsmtCond)] = "No Basement"
test$BsmtCond[is.na(test$BsmtCond)] = "No Basement"

train$BsmtExposure[is.na(train$BsmtExposure)] = "No Basement"
test$BsmtExposure[is.na(test$BsmtExposure)] = "No Basement"

train$BsmtFinType1[is.na(train$BsmtFinType1)] = "No Basement"
test$BsmtFinType1[is.na(test$BsmtFinType1)] = "No Basement"

train$BsmtFinType2[is.na(train$BsmtFinType2)] = "No Basement"
test$BsmtFinType2[is.na(test$BsmtFinType2)] = "No Basement"

train$FireplaceQu[is.na(train$FireplaceQu)] = "No Fireplace"
test$FireplaceQu[is.na(test$FireplaceQu)] = "No Fireplace"

train$GarageType[is.na(train$GarageType)] = "No Garage"
test$GarageType[is.na(test$GarageType)] = "No Garage"

train$GarageFinish[is.na(train$GarageFinish)] = "No Garage"
test$GarageFinish[is.na(test$GarageFinish)] = "No Garage"

train$GarageQual[is.na(train$GarageQual)] = "No Garage"
test$GarageQual[is.na(test$GarageQual)] = "No Garage"

train$GarageCond[is.na(train$GarageCond)] = "No Garage"
test$GarageCond[is.na(test$GarageCond)] = "No Garage"

train$PoolQC[is.na(train$PoolQC)] = "No Pool"
test$PoolQC[is.na(test$PoolQC)] = "No Pool"

train$Fence[is.na(train$Fence)] = "No Fence"
test$Fence[is.na(test$Fence)] = "No Fence"

train$MiscFeature[is.na(train$MiscFeature)] = "None"
test$MiscFeature[is.na(test$MiscFeature)] = "None"

With the changed values, the “NA” values remaining are numerical or categorical values that are not detailed in the data description text file.

print("Train Features that have NA")
## [1] "Train Features that have NA"
for (col in 1:ncol(train)) {
  x<- colnames(train[col])
  y<- sum(is.na(train[col]))
  if (y > 0) {
    print(paste(x,y))
  }
}
## [1] "LotFrontage 259"
## [1] "MasVnrType 8"
## [1] "MasVnrArea 8"
## [1] "Electrical 1"
## [1] "GarageYrBlt 81"
print("Test Features that have NA")
## [1] "Test Features that have NA"
for (col in 1:ncol(test)) {
  x<- colnames(test[col])
  y<- sum(is.na(test[col]))
  if (y > 0) {
    print(paste(x,y))
  }
}
## [1] "MSZoning 4"
## [1] "LotFrontage 227"
## [1] "Utilities 2"
## [1] "Exterior1st 1"
## [1] "Exterior2nd 1"
## [1] "MasVnrType 16"
## [1] "MasVnrArea 15"
## [1] "BsmtFinSF1 1"
## [1] "BsmtFinSF2 1"
## [1] "BsmtUnfSF 1"
## [1] "TotalBsmtSF 1"
## [1] "BsmtFullBath 2"
## [1] "BsmtHalfBath 2"
## [1] "KitchenQual 1"
## [1] "Functional 2"
## [1] "GarageYrBlt 78"
## [1] "GarageCars 1"
## [1] "GarageArea 1"
## [1] "SaleType 1"

These following categorical variables still have NA values.

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v tibble  3.0.3     v dplyr   1.0.2
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.0
## v purrr   0.3.4
## Warning: package 'readr' was built under R version 4.0.3
## Warning: package 'forcats' was built under R version 4.0.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
categoricalTestNA <- test %>% select_if(is.character)

print("Test Categorical Features")
## [1] "Test Categorical Features"
for (col in 1:ncol(categoricalTestNA)) {
  x<- colnames(categoricalTestNA[col])
  y<- sum(is.na(categoricalTestNA[col]))
  if (y > 0) {
    print(paste(x,y))
  }
}
## [1] "MSZoning 4"
## [1] "Utilities 2"
## [1] "Exterior1st 1"
## [1] "Exterior2nd 1"
## [1] "MasVnrType 16"
## [1] "KitchenQual 1"
## [1] "Functional 2"
## [1] "SaleType 1"
categoricalTrainNA <- train %>% select_if(is.character)

print("Train Categorical Features")
## [1] "Train Categorical Features"
for (col in 1:ncol(categoricalTrainNA)) {
  x<- colnames(categoricalTrainNA[col])
  y<- sum(is.na(categoricalTrainNA[col]))
  if (y > 0) {
    print(paste(x,y))
  }
}
## [1] "MasVnrType 8"
## [1] "Electrical 1"

For the following categorical features that have NA, replace with the mode value.

#mode function
Mode <- function(x) {
  ux <- unique(x)
  ux[which.max((tabulate(match(x, ux))))]
}

train$MasVnrType[is.na(train$MasVnrType)] = Mode(train$MasVnrType)
train$Electrical[is.na(train$Electrical)] = Mode(train$Electrical)

test$MSZoning[is.na(test$MSZoning)] = Mode(test$MSZoning)
test$Utilities[is.na(test$Utilities)] = Mode(test$Utilities)
test$Exterior1st[is.na(test$Exterior1st)] = Mode(test$Exterior1st)
test$Exterior2nd[is.na(test$Exterior2nd)] = Mode(test$Exterior2nd)
test$MasVnrType[is.na(test$MasVnrType)] = Mode(test$MasVnrType)
test$KitchenQual[is.na(test$KitchenQual)] = Mode(test$KitchenQual)
test$Functional[is.na(test$Functional)] = Mode(test$Functional)
test$SaleType[is.na(test$SaleType)] = Mode(test$SaleType)

For the numerical features, it seems plausible that a “NA” dictate that it is in fact 0. This links directly with the fact that some houses may not have a basement, alley, garage, fence, and etc.

#These are the numerical features left with NAs
numericalTestNA <- test %>% select_if(is.numeric)

print("Test Numerical Features")
## [1] "Test Numerical Features"
for (col in 1:ncol(numericalTestNA)) {
  x<- colnames(numericalTestNA[col])
  y<- sum(is.na(numericalTestNA[col]))
  if (y > 0) {
    print(paste(x,y))
  }
}
## [1] "LotFrontage 227"
## [1] "MasVnrArea 15"
## [1] "BsmtFinSF1 1"
## [1] "BsmtFinSF2 1"
## [1] "BsmtUnfSF 1"
## [1] "TotalBsmtSF 1"
## [1] "BsmtFullBath 2"
## [1] "BsmtHalfBath 2"
## [1] "GarageYrBlt 78"
## [1] "GarageCars 1"
## [1] "GarageArea 1"
numericalTrainNA <- train %>% select_if(is.numeric)

print("Train Numerical Features")
## [1] "Train Numerical Features"
for (col in 1:ncol(numericalTrainNA)) {
  x<- colnames(numericalTrainNA[col])
  y<- sum(is.na(numericalTrainNA[col]))
  if (y > 0) {
    print(paste(x,y))
  }
}
## [1] "LotFrontage 259"
## [1] "MasVnrArea 8"
## [1] "GarageYrBlt 81"

To address this issue, we will assign 0 to the features that contain “Bsmt” and “Garage”.

test$BsmtFinSF1[is.na(test$BsmtFinSF1)] = 0
test$BsmtFinSF2[is.na(test$BsmtFinSF2)] = 0
test$BsmtUnfSF[is.na(test$BsmtUnfSF)] = 0
test$TotalBsmtSF[is.na(test$TotalBsmtSF)] = 0
test$BsmtFullBath[is.na(test$BsmtFullBath)] = 0
test$BsmtHalfBath[is.na(test$BsmtHalfBath)] = 0
test$GarageCars[is.na(test$GarageCars)] = 0
test$GarageArea[is.na(test$GarageArea)] = 0

Secondly, for the GarageYrBlt and MasVnrArea, putting a 0 wouldn’t make sense. There are 78 rows that do not have a year assigned that could actually have a garage and the for MasVnrArea, there is some areas just not being report for the masonry veneer area because info was lacking. Instead of removing the rows completely (effectively removing a minimum of 78 rows and up to a maximum of 86 rows), we can assign a value of -1 that assign a non-logical value and can help identify that these are outliers.

test$GarageYrBlt[is.na(test$GarageYrBlt)] = -1
test$MasVnrArea[is.na(test$MasVnrArea)] = -1
train$MasVnrArea[is.na(train$MasVnrArea)] = -1
train$GarageYrBlt[is.na(train$GarageYrBlt)] = -1

For the case of the missing LotFrontage area, we can assume there are outliers present and so a median approach can be used for these missing values

test$LotFrontage[is.na(test$LotFrontage)] = median(test$LotFrontage, na.rm = TRUE)
train$LotFrontage[is.na(train$LotFrontage)] = median(train$LotFrontage, na.rm = TRUE)

Corrplot to see what is plotted in relevance

library(corrplot)
## Warning: package 'corrplot' was built under R version 4.0.3
## corrplot 0.84 loaded
library(tidyverse)
library(dplyr)
 

numValues <- select_if(train, is.numeric)
trainCor <- cor(numValues)
corrplot(trainCor, method = "shade")

threshold <- 0.5
cor_filter <- trainCor
diag(cor_filter) <- 0

filter <- apply(cor_filter,1, function(x) sum(abs(x) >= threshold))

sel <- filter
cor_final <- cor_filter[sel, sel]
corrplot(cor_final, method = "color")

For the purposes of forming a linear regression model, the categorical attributes most be assigned a number value instead of a character value.

#Perform all the changes on a second duplicate data frame
modifiedTest <- test
modifiedTrain <- train

#Modifying the values to the number values; they are assigned alphabetically
modifiedTest$MSZoning <- as.numeric(factor(modifiedTest$MSZoning))
modifiedTest$Street <- as.numeric(factor(modifiedTest$Street))
modifiedTest$Alley <- as.numeric(factor(modifiedTest$Alley))
modifiedTest$LotShape <- as.numeric(factor(modifiedTest$LotShape))
modifiedTest$LandContour <- as.numeric(factor(modifiedTest$LandContour))
modifiedTest$Utilities <- as.numeric(factor(modifiedTest$Utilities))
modifiedTest$LotConfig <- as.numeric(factor(modifiedTest$LotConfig))
modifiedTest$LandSlope <- as.numeric(factor(modifiedTest$LandSlope))
modifiedTest$Neighborhood <- as.numeric(factor(modifiedTest$Neighborhood))
modifiedTest$Condition1 <- as.numeric(factor(modifiedTest$Condition1))
modifiedTest$Condition2 <- as.numeric(factor(modifiedTest$Condition2))
modifiedTest$BldgType <- as.numeric(factor(modifiedTest$BldgType))
modifiedTest$HouseStyle <- as.numeric(factor(modifiedTest$HouseStyle))
modifiedTest$RoofStyle <- as.numeric(factor(modifiedTest$RoofStyle))
modifiedTest$RoofMatl <- as.numeric(factor(modifiedTest$RoofMatl))
modifiedTest$Exterior1st <- as.numeric(factor(modifiedTest$Exterior1st))
modifiedTest$Exterior2nd <- as.numeric(factor(modifiedTest$Exterior2nd))
modifiedTest$MasVnrType <- as.numeric(factor(modifiedTest$MasVnrType))
modifiedTest$ExterQual <- as.numeric(factor(modifiedTest$ExterQual))
modifiedTest$ExterCond <- as.numeric(factor(modifiedTest$ExterCond))
modifiedTest$Foundation <- as.numeric(factor(modifiedTest$Foundation))
modifiedTest$BsmtQual <- as.numeric(factor(modifiedTest$BsmtQual))
modifiedTest$BsmtCond <- as.numeric(factor(modifiedTest$BsmtCond))
modifiedTest$BsmtExposure <- as.numeric(factor(modifiedTest$BsmtExposure))
modifiedTest$BsmtFinType1 <- as.numeric(factor(modifiedTest$BsmtFinType1))
modifiedTest$BsmtFinType2 <- as.numeric(factor(modifiedTest$BsmtFinType2))
modifiedTest$Heating <- as.numeric(factor(modifiedTest$Heating))
modifiedTest$HeatingQC <- as.numeric(factor(modifiedTest$HeatingQC))
modifiedTest$CentralAir <- as.numeric(factor(modifiedTest$CentralAir))
modifiedTest$Electrical <- as.numeric(factor(modifiedTest$Electrical))
modifiedTest$KitchenQual <- as.numeric(factor(modifiedTest$KitchenQual))
modifiedTest$Functional <- as.numeric(factor(modifiedTest$Functional))
modifiedTest$FireplaceQu <- as.numeric(factor(modifiedTest$FireplaceQu))
modifiedTest$GarageType <- as.numeric(factor(modifiedTest$GarageType))
modifiedTest$GarageFinish <- as.numeric(factor(modifiedTest$GarageFinish))
modifiedTest$GarageQual <- as.numeric(factor(modifiedTest$GarageQual))
modifiedTest$GarageCond <- as.numeric(factor(modifiedTest$GarageCond))
modifiedTest$PavedDrive <- as.numeric(factor(modifiedTest$PavedDrive))
modifiedTest$PoolQC <- as.numeric(factor(modifiedTest$PoolQC))
modifiedTest$Fence <- as.numeric(factor(modifiedTest$Fence))
modifiedTest$MiscFeature <- as.numeric(factor(modifiedTest$MiscFeature))
modifiedTest$SaleType <- as.numeric(factor(modifiedTest$SaleType))
modifiedTest$SaleCondition <- as.numeric(factor(modifiedTest$SaleCondition))

modifiedTrain$MSZoning <- as.numeric(factor(modifiedTrain$MSZoning))
modifiedTrain$Street <- as.numeric(factor(modifiedTrain$Street))
modifiedTrain$Alley <- as.numeric(factor(modifiedTrain$Alley))
modifiedTrain$LotShape <- as.numeric(factor(modifiedTrain$LotShape))
modifiedTrain$LandContour <- as.numeric(factor(modifiedTrain$LandContour))
modifiedTrain$Utilities <- as.numeric(factor(modifiedTrain$Utilities))
modifiedTrain$LotConfig <- as.numeric(factor(modifiedTrain$LotConfig))
modifiedTrain$LandSlope <- as.numeric(factor(modifiedTrain$LandSlope))
modifiedTrain$Neighborhood <- as.numeric(factor(modifiedTrain$Neighborhood))
modifiedTrain$Condition1 <- as.numeric(factor(modifiedTrain$Condition1))
modifiedTrain$Condition2 <- as.numeric(factor(modifiedTrain$Condition2))
modifiedTrain$BldgType <- as.numeric(factor(modifiedTrain$BldgType))
modifiedTrain$HouseStyle <- as.numeric(factor(modifiedTrain$HouseStyle))
modifiedTrain$RoofStyle <- as.numeric(factor(modifiedTrain$RoofStyle))
modifiedTrain$RoofMatl <- as.numeric(factor(modifiedTrain$RoofMatl))
modifiedTrain$Exterior1st <- as.numeric(factor(modifiedTrain$Exterior1st))
modifiedTrain$Exterior2nd <- as.numeric(factor(modifiedTrain$Exterior2nd))
modifiedTrain$MasVnrType <- as.numeric(factor(modifiedTrain$MasVnrType))
modifiedTrain$ExterQual <- as.numeric(factor(modifiedTrain$ExterQual))
modifiedTrain$ExterCond <- as.numeric(factor(modifiedTrain$ExterCond))
modifiedTrain$Foundation <- as.numeric(factor(modifiedTrain$Foundation))
modifiedTrain$BsmtQual <- as.numeric(factor(modifiedTrain$BsmtQual))
modifiedTrain$BsmtCond <- as.numeric(factor(modifiedTrain$BsmtCond))
modifiedTrain$BsmtExposure <- as.numeric(factor(modifiedTrain$BsmtExposure))
modifiedTrain$BsmtFinType1 <- as.numeric(factor(modifiedTrain$BsmtFinType1))
modifiedTrain$BsmtFinType2 <- as.numeric(factor(modifiedTrain$BsmtFinType2))
modifiedTrain$Heating <- as.numeric(factor(modifiedTrain$Heating))
modifiedTrain$HeatingQC <- as.numeric(factor(modifiedTrain$HeatingQC))
modifiedTrain$CentralAir <- as.numeric(factor(modifiedTrain$CentralAir))
modifiedTrain$Electrical <- as.numeric(factor(modifiedTrain$Electrical))
modifiedTrain$KitchenQual <- as.numeric(factor(modifiedTrain$KitchenQual))
modifiedTrain$Functional <- as.numeric(factor(modifiedTrain$Functional))
modifiedTrain$FireplaceQu <- as.numeric(factor(modifiedTrain$FireplaceQu))
modifiedTrain$GarageType <- as.numeric(factor(modifiedTrain$GarageType))
modifiedTrain$GarageFinish <- as.numeric(factor(modifiedTrain$GarageFinish))
modifiedTrain$GarageQual <- as.numeric(factor(modifiedTrain$GarageQual))
modifiedTrain$GarageCond <- as.numeric(factor(modifiedTrain$GarageCond))
modifiedTrain$PavedDrive <- as.numeric(factor(modifiedTrain$PavedDrive))
modifiedTrain$PoolQC <- as.numeric(factor(modifiedTrain$PoolQC))
modifiedTrain$Fence <- as.numeric(factor(modifiedTrain$Fence))
modifiedTrain$MiscFeature <- as.numeric(factor(modifiedTrain$MiscFeature))
modifiedTrain$SaleType <- as.numeric(factor(modifiedTrain$SaleType))
modifiedTrain$SaleCondition <- as.numeric(factor(modifiedTrain$SaleCondition))

For feature selection, we will use the Random Forest based on the Boruta algorithm.

library(randomForest)
## Warning: package 'randomForest' was built under R version 4.0.3
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(caret)
## Warning: package 'caret' was built under R version 4.0.3
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
set.seed(100)
x <- modifiedTrain[1:1000,]
y<- modifiedTrain[1001:1451,]
rfImp <- randomForest(SalePrice ~., data = x, ntree = 500, importance = TRUE, na.action = na.roughfix)

importance(rfImp)
##                   %IncMSE IncNodePurity
## Id            -0.33724662  2.564736e+10
## MSSubClass     9.40657763  1.905665e+10
## MSZoning       8.98869457  1.024157e+10
## LotFrontage    5.19909463  5.041257e+10
## LotArea       11.06339283  1.295530e+11
## Street        -1.00100150  1.106835e+08
## Alley          3.02263337  1.290483e+09
## LotShape       2.58967033  6.885532e+09
## LandContour    0.13054549  9.691593e+09
## Utilities      0.00000000  1.963291e+06
## LotConfig      0.67618849  5.361574e+09
## LandSlope      2.98896791  8.458013e+09
## Neighborhood   9.33074734  4.003759e+10
## Condition1     1.02740124  3.889137e+09
## Condition2    -1.42452578  8.647074e+08
## BldgType       4.51208201  4.663291e+09
## HouseStyle     3.46535898  7.495093e+09
## OverallQual   23.40366716  1.541388e+12
## OverallCond    6.09570086  2.330339e+10
## YearBuilt     14.98798572  3.290240e+11
## YearRemodAdd   7.19150012  6.942981e+10
## RoofStyle      2.36801327  8.304531e+09
## RoofMatl      -2.30948584  1.203275e+10
## Exterior1st    5.42065871  1.551306e+10
## Exterior2nd    3.96007362  1.279377e+10
## MasVnrType     2.92117368  6.331765e+09
## MasVnrArea     5.80526822  7.276429e+10
## ExterQual      8.74813658  2.457379e+11
## ExterCond      1.24140494  3.964406e+09
## Foundation     3.94061910  6.122764e+09
## BsmtQual       6.25793799  1.335838e+11
## BsmtCond       0.55267747  2.386854e+09
## BsmtExposure   1.56410906  1.271892e+10
## BsmtFinType1   7.95466107  1.709935e+10
## BsmtFinSF1     6.95421001  1.512946e+11
## BsmtFinType2   1.64515276  3.596004e+09
## BsmtFinSF2     2.51958984  5.504324e+09
## BsmtUnfSF      5.06491772  3.479275e+10
## TotalBsmtSF   15.15309387  2.950302e+11
## Heating        0.17066201  1.404862e+09
## HeatingQC      5.34209808  5.924930e+09
## CentralAir     3.92472315  7.690464e+09
## Electrical    -0.36386615  1.602042e+09
## X1stFlrSF     13.82739428  2.556271e+11
## X2ndFlrSF     10.20407855  1.173135e+11
## LowQualFinSF  -1.35605770  1.395875e+09
## GrLivArea     33.05018382  9.188650e+11
## BsmtFullBath   3.22583630  1.261374e+10
## BsmtHalfBath   3.10975364  1.113360e+10
## FullBath       9.98561570  1.061800e+11
## HalfBath       7.47745831  1.021306e+10
## BedroomAbvGr   4.92004896  1.445991e+10
## KitchenAbvGr   5.41672243  9.237209e+09
## KitchenQual    6.00693127  9.629769e+10
## TotRmsAbvGrd   5.11237367  8.159518e+10
## Functional     1.91323527  3.539223e+09
## Fireplaces    10.45799148  4.703972e+10
## FireplaceQu    6.36361050  1.225761e+10
## GarageType    10.36822854  4.334701e+10
## GarageYrBlt    8.99796576  1.166383e+11
## GarageFinish   6.99874961  1.305272e+10
## GarageCars    12.39968619  6.834243e+11
## GarageArea    12.24536826  2.542581e+11
## GarageQual     1.62335910  5.723552e+09
## GarageCond     2.79419399  2.875112e+09
## PavedDrive     4.97894854  6.382699e+09
## WoodDeckSF     2.81724635  3.129951e+10
## OpenPorchSF    4.81242923  3.248856e+10
## EnclosedPorch  0.21952135  5.960823e+09
## X3SsnPorch     0.62408749  4.570850e+08
## ScreenPorch    0.02128592  1.458320e+10
## PoolArea       0.00000000  1.086433e+08
## PoolQC         0.00000000  2.340141e+07
## Fence          0.94509165  2.306340e+09
## MiscFeature   -2.01726723  4.423697e+08
## MiscVal       -1.56387341  7.444683e+08
## MoSold         0.81551435  3.563767e+10
## YrSold         0.14567719  1.175499e+10
## SaleType       2.39047908  7.388047e+09
## SaleCondition  1.86038809  1.105829e+10
varImpPlot(rfImp, n.var = 10)

varImp(rfImp)

Normalize the data. The Utilities attribute for the train dataset all contain 0 – it is not possible to normalize that column if they all have the same value.

normalize <- function(x) {
  return ((x-min(x)) / (max(x)-min(x)))
}

#Utilities was excluded as it can't be normalized since it has one-sided results in test
testNorm <- apply(modifiedTest[, c(2:9, 11:80)],2,function(x) (x-min(x)/(max(x)-min(x))))
trainNorm <- apply(modifiedTrain[, 2:81],2,function(x) (x-min(x)/(max(x)-min(x))))

Linear training model with K-fold cross-validation of 10-folds

library(MASS)
## Warning: package 'MASS' was built under R version 4.0.3
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(leaps)
## Warning: package 'leaps' was built under R version 4.0.3
model_train <- lm(SalePrice ~ ., data = data.frame(trainNorm))
model_train
## 
## Call:
## lm(formula = SalePrice ~ ., data = data.frame(trainNorm))
## 
## Coefficients:
##   (Intercept)     MSSubClass       MSZoning    LotFrontage        LotArea  
##     1.338e+06     -1.125e+02     -1.285e+03     -1.605e+02      4.145e-01  
##        Street          Alley       LotShape    LandContour      Utilities  
##     3.126e+04      2.676e+03     -8.757e+02      3.150e+03     -5.410e+04  
##     LotConfig      LandSlope   Neighborhood     Condition1     Condition2  
##    -7.691e+00      5.267e+03      2.548e+02     -8.847e+02     -9.228e+03  
##      BldgType     HouseStyle    OverallQual    OverallCond      YearBuilt  
##    -2.868e+03     -1.078e+03      1.107e+04      5.338e+03      2.086e+02  
##  YearRemodAdd      RoofStyle       RoofMatl    Exterior1st    Exterior2nd  
##    -2.080e+01      2.090e+03      5.191e+03     -1.008e+03      3.864e+02  
##    MasVnrType     MasVnrArea      ExterQual      ExterCond     Foundation  
##     4.598e+03      3.346e+01     -1.008e+04      8.470e+02      8.812e+02  
##      BsmtQual       BsmtCond   BsmtExposure   BsmtFinType1     BsmtFinSF1  
##    -4.230e+03      1.323e+03     -3.482e+03     -6.409e+02      8.738e+00  
##  BsmtFinType2     BsmtFinSF2      BsmtUnfSF    TotalBsmtSF        Heating  
##     7.205e+02      1.138e+01      3.662e-01             NA     -2.269e+03  
##     HeatingQC     CentralAir     Electrical      X1stFlrSF      X2ndFlrSF  
##    -6.235e+02      7.358e+02     -5.148e+02      4.786e+01      4.324e+01  
##  LowQualFinSF      GrLivArea   BsmtFullBath   BsmtHalfBath       FullBath  
##    -1.769e+00             NA      6.341e+03     -4.110e+02      2.848e+03  
##      HalfBath   BedroomAbvGr   KitchenAbvGr    KitchenQual   TotRmsAbvGrd  
##    -2.931e+02     -3.868e+03     -1.264e+04     -8.397e+03      3.853e+03  
##    Functional     Fireplaces    FireplaceQu     GarageType    GarageYrBlt  
##     3.742e+03      4.997e+03     -1.550e+03      3.420e+02     -8.287e+00  
##  GarageFinish     GarageCars     GarageArea     GarageQual     GarageCond  
##    -8.922e+02      1.379e+04      9.837e-02     -9.608e+02      9.871e+02  
##    PavedDrive     WoodDeckSF    OpenPorchSF  EnclosedPorch     X3SsnPorch  
##     2.544e+03      2.215e+01     -2.948e+00     -1.275e+00      3.003e+01  
##   ScreenPorch       PoolArea         PoolQC          Fence    MiscFeature  
##     4.602e+01     -2.826e+02     -8.800e+04      7.638e+01     -3.059e+03  
##       MiscVal         MoSold         YrSold       SaleType  SaleCondition  
##     8.721e-02     -1.710e+02     -9.053e+02     -5.985e+02      3.180e+03
set.seed(123)
train.control <- trainControl(method = "repeatedcv", 
                              number = 10, repeats = 3)
# Train the model
model <- train(SalePrice ~., data = trainNorm, method = "lm",
               trControl = train.control)
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
# Summarize the results
print(model)
## Linear Regression 
## 
## 1460 samples
##   79 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 1312, 1313, 1315, 1316, 1314, 1315, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   36126.82  0.7994177  20498.66
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

Plot of linear regression with with all attributes.

predicted_pricesLM <- predict(model, newdata = modifiedTrain)
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
 plot(predicted_pricesLM ,modifiedTrain$SalePrice,
      xlab="Predicted Prices",ylab="Actual Prices", main = "Linear Regression - All Attributes")
 abline(a=0,b=1)

Revised version with reduced feature selection that was selected via Random Forest

model_train <- lm(SalePrice ~ OverallQual+GrLivArea+GarageCars+YearBuilt+TotalBsmtSF+X1stFlrSF+GarageArea+ExterQual+BsmtFinSF1+BsmtQual, data = data.frame(trainNorm))
model_train
## 
## Call:
## lm(formula = SalePrice ~ OverallQual + GrLivArea + GarageCars + 
##     YearBuilt + TotalBsmtSF + X1stFlrSF + GarageArea + ExterQual + 
##     BsmtFinSF1 + BsmtQual, data = data.frame(trainNorm))
## 
## Coefficients:
## (Intercept)  OverallQual    GrLivArea   GarageCars    YearBuilt  TotalBsmtSF  
##  -2.308e+05    1.691e+04    4.639e+01    1.176e+04    1.292e+02    8.025e+00  
##   X1stFlrSF   GarageArea    ExterQual   BsmtFinSF1     BsmtQual  
##   1.137e+01    4.942e+00   -1.543e+04    2.079e+01   -4.999e+03
set.seed(123)
train.control <- trainControl(method = "repeatedcv", 
                              number = 10, repeats = 3)
# Train the model
model <- train(SalePrice ~ OverallQual+GrLivArea+GarageCars+YearBuilt+TotalBsmtSF+X1stFlrSF+GarageArea+ExterQual+BsmtFinSF1+BsmtQual, data = trainNorm, method = "lm",
               trControl = train.control)
# Summarize the results
print(model)
## Linear Regression 
## 
## 1460 samples
##   10 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 1312, 1313, 1315, 1316, 1314, 1315, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   36210.37  0.7973232  22593.49
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
summary(model)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -532092  -17106    -564   15012  260160 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.308e+05  8.968e+04  -2.574  0.01016 *  
## OverallQual  1.691e+04  1.177e+03  14.375  < 2e-16 ***
## GrLivArea    4.638e+01  2.578e+00  17.992  < 2e-16 ***
## GarageCars   1.176e+04  2.900e+03   4.055 5.28e-05 ***
## YearBuilt    1.292e+02  4.542e+01   2.845  0.00451 ** 
## TotalBsmtSF  8.025e+00  4.234e+00   1.895  0.05824 .  
## X1stFlrSF    1.137e+01  4.686e+00   2.426  0.01537 *  
## GarageArea   4.942e+00  9.823e+00   0.503  0.61497    
## ExterQual   -1.543e+04  1.886e+03  -8.182 6.06e-16 ***
## BsmtFinSF1   2.079e+01  2.468e+00   8.425  < 2e-16 ***
## BsmtQual    -4.999e+03  1.053e+03  -4.746 2.28e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 36040 on 1449 degrees of freedom
## Multiple R-squared:  0.7956, Adjusted R-squared:  0.7942 
## F-statistic: 564.1 on 10 and 1449 DF,  p-value: < 2.2e-16

Predict some using the linear model of selected features.

predicted_pricesTrain <- predict(model, newdata = modifiedTrain)
head(predicted_pricesTrain)
##        1        2        3        4        5        6 
## 221934.2 178414.8 222164.1 187881.6 278822.1 154281.5
predicted_prices <- predict(model, newdata = modifiedTest)
head(predicted_prices)
##        1        2        3        4        5        6 
## 104347.4 156864.3 170980.1 172836.1 216285.1 168726.6

Plot of linear regression with with random-forest selected features.

 plot(predicted_pricesTrain ,modifiedTrain$SalePrice,
      xlab="Predicted Prices",ylab="Actual Prices", main = "Linear Regression with Random-Forest")
 abline(a=0,b=1)

Partition Data

set.seed(100)

index = sample(1:nrow(modifiedTrain), 0.7*nrow(modifiedTrain))

train1 = modifiedTrain[index, ] #training
test1 = modifiedTrain[-index, ] #test data

dim(train1)
## [1] 1021   81
dim(test1)
## [1] 439  81

Scale numeric

library(glmnet)
## Warning: package 'glmnet' was built under R version 4.0.3
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loaded glmnet 4.0-2
cols = c("OverallQual", "GrLivArea", "GarageCars","YearBuilt","TotalBsmtSF", "X1stFlrSF", "GarageArea", "ExterQual", "BsmtFinSF1", "BsmtQual")

pre_proc_val <- preProcess(modifiedTrain[,cols], method = c("center", "scale"))

train1[,cols] = predict(pre_proc_val, train1[,cols])
test1[,cols] = predict(pre_proc_val, test1[,cols])

Regularize coefficients

cols_reg = c("OverallQual", "GrLivArea", "GarageCars","YearBuilt","TotalBsmtSF", "X1stFlrSF", "GarageArea", "ExterQual", "BsmtFinSF1", "BsmtQual", "SalePrice")

dummies <- dummyVars(SalePrice ~ ., data = modifiedTrain[,cols_reg])

train_dummies = predict(dummies, newdata = train1[,cols_reg])
test_dummies = predict(dummies, newdata = test1[,cols_reg])

print(dim(train_dummies)); print(dim(test_dummies))
## [1] 1021   10
## [1] 439  10

Develop regularized lasso regression to find best lambda

x <- as.matrix(train_dummies)
y_train <- train1$SalePrice

x_test <- as.matrix(test_dummies)
y_test <- test1$SalePrice

lambdas <- 10^seq(2, -3, by = -0.1)

lasso_reg <- cv.glmnet(x, y_train, alpha = 1, lambda = lambdas, standardize = TRUE, nfolds = 5)
lambda_best <- lasso_reg$lambda.min
lambda_best
## [1] 100

Model lasso regression

lasso_model <- glmnet(x, y_train, alpha = 1, lambda = lambda_best, standardize = TRUE)

eval_results <- function(true, predicted, df) {
  SSE <- sum((predicted - true)^2)
  SST <- sum((true - mean(true))^2)
  R_square <- 1 - SSE / SST
  RMSE = sqrt(SSE/nrow(df))

# Model performance metrics
data.frame(
  RMSE = RMSE,
  Rsquare = R_square
)
  
}

coef(lasso_model)
## 11 x 1 sparse Matrix of class "dgCMatrix"
##                      s0
## (Intercept) 181573.1438
## OverallQual  25470.8587
## GrLivArea    25464.2439
## GarageCars   10445.4066
## YearBuilt     2778.1501
## TotalBsmtSF    245.1876
## X1stFlrSF     5213.0791
## GarageArea       .     
## ExterQual    -9925.5324
## BsmtFinSF1    9668.4035
## BsmtQual     -6005.1565
predictions_train <- predict(lasso_model, s = lambda_best, newx = x)
eval_results(y_train, predictions_train, modifiedTrain)
predictions_test <- predict(lasso_model, s = lambda_best, newx = x_test)
eval_results(y_test, predictions_test, modifiedTest)

Plot of lasso regression with with random-forest selected features.

 plot(predictions_train ,train1$SalePrice,
      xlab="Predicted Prices",ylab="Actual Prices", main = "Lasso Regression with Random-Forest")
 abline(a=0,b=1)

Make lasso regression letting its own algorithm determine features/attributes.

set.seed(100)

index = sample(1:nrow(modifiedTrain), 0.7*nrow(modifiedTrain))

train2 = modifiedTrain[index, ] #training
test2 = modifiedTrain[-index, ] #test data

pre_proc_val1 <- preProcess(modifiedTrain, method = c("center", "scale"))

train2 = predict(pre_proc_val, train2)
test2 = predict(pre_proc_val1, test2)


dummies <- dummyVars(SalePrice ~ ., data = modifiedTrain)

train_dummies1 = predict(dummies, newdata = train2)
test_dummies1 = predict(dummies, newdata = test2)


x <- as.matrix(train_dummies1)
y_train <- train2$SalePrice

x_test <- as.matrix(test_dummies1)
y_test <- test2$SalePrice

lambdas <- 10^seq(2, -3, by = -0.1)

lasso_reg <- cv.glmnet(x, y_train, alpha = 1, lambda = lambdas, standardize = TRUE, nfolds = 5)
lambda_best <- lasso_reg$lambda.min
lambda_best
## [1] 100
lasso_model <- glmnet(x, y_train, alpha = 1, lambda = lambda_best, standardize = TRUE)


eval_results <- function(true, predicted, df) {
  SSE <- sum((predicted - true)^2)
  SST <- sum((true - mean(true))^2)
  R_square <- 1 - SSE / SST
  RMSE = sqrt(SSE/nrow(df))

  
# Model performance metrics
data.frame(
  RMSE = RMSE,
  Rsquare = R_square
)
  
}

coef(lasso_model)
## 81 x 1 sparse Matrix of class "dgCMatrix"
##                          s0
## (Intercept)    2.254367e+06
## Id            -5.780810e-01
## MSSubClass    -1.310120e+02
## MSZoning      -1.546349e+03
## LotFrontage   -1.120779e+02
## LotArea        5.199583e-01
## Street         3.725504e+04
## Alley          1.699418e+03
## LotShape      -7.913624e+02
## LandContour    1.314962e+02
## Utilities     -4.920764e+04
## LotConfig      .           
## LandSlope      1.547381e+03
## Neighborhood   1.228866e+02
## Condition1     6.656055e+01
## Condition2    -2.651524e+03
## BldgType      -1.599719e+03
## HouseStyle    -6.546593e+02
## OverallQual    1.618188e+04
## OverallCond    4.157100e+03
## YearBuilt      3.031376e+03
## YearRemodAdd  -1.051323e+01
## RoofStyle      3.415164e+03
## RoofMatl       1.466345e+02
## Exterior1st   -1.119925e+03
## Exterior2nd    1.061778e+02
## MasVnrType     4.352602e+03
## MasVnrArea     2.899631e+01
## ExterQual     -6.089289e+03
## ExterCond      1.036432e+03
## Foundation     6.866980e+02
## BsmtQual      -5.201855e+03
## BsmtCond       3.788160e+02
## BsmtExposure  -3.393524e+03
## BsmtFinType1  -6.984326e+02
## BsmtFinSF1     4.904043e+03
## BsmtFinType2   4.720977e+02
## BsmtFinSF2     1.014427e+01
## BsmtUnfSF     -9.173713e-01
## TotalBsmtSF    .           
## Heating       -2.631787e+03
## HeatingQC     -1.477152e+03
## CentralAir     9.262570e+02
## Electrical    -2.020612e+02
## X1stFlrSF      2.621791e+03
## X2ndFlrSF      .           
## LowQualFinSF  -5.772143e+01
## GrLivArea      2.360197e+04
## BsmtFullBath   6.506220e+03
## BsmtHalfBath  -3.093603e+03
## FullBath       4.196272e+03
## HalfBath       1.866377e+03
## BedroomAbvGr  -5.424431e+03
## KitchenAbvGr  -1.602417e+04
## KitchenQual   -7.586985e+03
## TotRmsAbvGrd   4.296537e+03
## Functional     5.454336e+03
## Fireplaces     3.426877e+03
## FireplaceQu   -8.730977e+02
## GarageType     6.435110e+02
## GarageYrBlt   -7.301378e+00
## GarageFinish  -8.454088e+02
## GarageCars     9.386543e+03
## GarageArea     6.206339e+02
## GarageQual    -5.972414e+02
## GarageCond     7.553067e+02
## PavedDrive     2.581035e+03
## WoodDeckSF     2.243428e+01
## OpenPorchSF    2.197549e+01
## EnclosedPorch -4.192071e+01
## X3SsnPorch     9.838724e+00
## ScreenPorch    4.627182e+01
## PoolArea      -1.065078e+03
## PoolQC        -2.490499e+05
## Fence          .           
## MiscFeature   -3.243666e+03
## MiscVal       -1.424219e-01
## MoSold        -3.793662e+01
## YrSold        -5.456195e+02
## SaleType      -8.879488e+02
## SaleCondition  3.325754e+03
predictions_train <- predict(lasso_model, s = lambda_best, newx = x)
eval_results(y_train, predictions_train, modifiedTrain)
predictions_test <- predict(lasso_model, s = lambda_best, newx = x_test)
eval_results(y_test, predictions_test, modifiedTest)

Plot of lasso regression with all attributes.

 plot(predictions_train ,train2$SalePrice,
      xlab="Predicted Prices",ylab="Actual Prices", main = "Lasso Regression - All Attributes")
 abline(a=0,b=1)